library(data.table) # Efficient Dataframe
library(lubridate) # For Dates
library(tidyverse) # Multiple Package for Useful Data wrangling
library(esquisse) # Intuitive plotting
library(plyr) # Data splitting
library(dplyr) # Data Wrangling
library(ggplot2) # Plot Graphs
library(naniar) # for NA exploration in Dataframe
library(plotly) # Make ggplot2 Dynamic
library(gridExtra) # Multiple Plot at once
library(RColorBrewer) # For Color Palette
library(rmdformats) # Theme of HTML
library(flextable) # Show Table
library(class) # K-NN
library(summarytools) # Beautiful and Efficient Summary for Dataset
library(pivottabler) # Pivot Table
library(naivebayes) # Naive Bayes Function
library(caret) # Confusion Matrix
library(leaps) # Exhaustive Search
library(forecast) # Predictions
library(neuralnet) # Neural Network
library(nnet) # Neural Network
library(manipulateWidget) # Plotly Combiner
library(rpart) # Regression Tree
library(rpart.plot) # Plotting Regression Tree# Load the Dataset with Fread()
ToyotaDT <- fread("DATA/ToyotaCorolla.csv")# Preview of the Dataset
DT::datatable(head(ToyotaDT,2))Dataset Description: The file ToyotaCorolla.csv contains data on used cars on sale during late summer of 2004 in the Netherlands. It has 1436 records containing detail on 38 attributes, including Price, Age, Kilometers, HP (Horse Power), and other specifications.
dfSummary(ToyotaDT,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.75,
valid.col = FALSE,
tmp.img.dir = "/tmp")Dimensions: 1436 x 39
Duplicates: 0
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | Id [integer] |
Mean (sd) : 721.6 (416.5) min < med < max: 1 < 721.5 < 1442 IQR (CV) : 719.5 (0.6) |
1436 distinct values | 0 (0.0%) |
|
| 2 | Model [character] |
1. TOYOTA Corolla 1.6 16V HA 2. TOYOTA Corolla 1.3 16V HA 3. TOYOTA Corolla 1.6 16V LI 4. TOYOTA Corolla 1.6 16V LI 5. TOYOTA Corolla 1.6 16V SE 6. TOYOTA Corolla 1.4 16V VV 7. TOYOTA Corolla 1.3 16V LI 8. TOYOTA Corolla 1.6 16V VV 9. TOYOTA Corolla 1.6 16V WA 10. TOYOTA Corolla 1.6 16V VV [ 362 others ] |
107 ( 7.5%) 83 ( 5.8%) 79 ( 5.5%) 70 ( 4.9%) 43 ( 3.0%) 42 ( 2.9%) 35 ( 2.4%) 31 ( 2.2%) 28 ( 1.9%) 27 ( 1.9%) 891 (62.0%) |
0 (0.0%) |
|
| 3 | Price [integer] |
Mean (sd) : 10730.8 (3627) min < med < max: 4350 < 9900 < 32500 IQR (CV) : 3500 (0.3) |
236 distinct values | 0 (0.0%) |
|
| 4 | Age_08_04 [integer] |
Mean (sd) : 55.9 (18.6) min < med < max: 1 < 61 < 80 IQR (CV) : 26 (0.3) |
77 distinct values | 0 (0.0%) |
|
| 5 | Mfg_Month [integer] |
Mean (sd) : 5.5 (3.4) min < med < max: 1 < 5 < 12 IQR (CV) : 5 (0.6) |
12 distinct values | 0 (0.0%) |
|
| 6 | Mfg_Year [integer] |
Mean (sd) : 1999.6 (1.5) min < med < max: 1998 < 1999 < 2004 IQR (CV) : 3 (0) |
1998 : 392 (27.3%) 1999 : 441 (30.7%) 2000 : 225 (15.7%) 2001 : 192 (13.4%) 2002 : 87 ( 6.1%) 2003 : 75 ( 5.2%) 2004 : 24 ( 1.7%) |
0 (0.0%) |
|
| 7 | KM [integer] |
Mean (sd) : 68533.3 (37506.4) min < med < max: 1 < 63389.5 < 243000 IQR (CV) : 44020.8 (0.5) |
1263 distinct values | 0 (0.0%) |
|
| 8 | Fuel_Type [character] |
1. CNG 2. Diesel 3. Petrol |
17 ( 1.2%) 155 (10.8%) 1264 (88.0%) |
0 (0.0%) |
|
| 9 | HP [integer] |
Mean (sd) : 101.5 (15) min < med < max: 69 < 110 < 192 IQR (CV) : 20 (0.1) |
12 distinct values | 0 (0.0%) |
|
| 10 | Met_Color [integer] |
Min : 0 Mean : 0.7 Max : 1 |
0 : 467 (32.5%) 1 : 969 (67.5%) |
0 (0.0%) |
|
| 11 | Color [character] |
1. Beige 2. Black 3. Blue 4. Green 5. Grey 6. Red 7. Silver 8. Violet 9. White 10. Yellow |
3 ( 0.2%) 191 (13.3%) 283 (19.7%) 220 (15.3%) 301 (21.0%) 278 (19.4%) 122 ( 8.5%) 4 ( 0.3%) 31 ( 2.2%) 3 ( 0.2%) |
0 (0.0%) |
|
| 12 | Automatic [integer] |
Min : 0 Mean : 0.1 Max : 1 |
0 : 1356 (94.4%) 1 : 80 ( 5.6%) |
0 (0.0%) |
|
| 13 | CC [integer] |
Mean (sd) : 1576.9 (424.4) min < med < max: 1300 < 1600 < 16000 IQR (CV) : 200 (0.3) |
13 distinct values | 0 (0.0%) |
|
| 14 | Doors [integer] |
Mean (sd) : 4 (1) min < med < max: 2 < 4 < 5 IQR (CV) : 2 (0.2) |
2 : 2 ( 0.1%) 3 : 622 (43.3%) 4 : 138 ( 9.6%) 5 : 674 (46.9%) |
0 (0.0%) |
|
| 15 | Cylinders [integer] |
1 distinct value | 4 : 1436 (100.0%) | 0 (0.0%) |
|
| 16 | Gears [integer] |
Mean (sd) : 5 (0.2) min < med < max: 3 < 5 < 6 IQR (CV) : 0 (0) |
3 : 2 ( 0.1%) 4 : 1 ( 0.1%) 5 : 1390 (96.8%) 6 : 43 ( 3.0%) |
0 (0.0%) |
|
| 17 | Quarterly_Tax [integer] |
Mean (sd) : 87.1 (41.1) min < med < max: 19 < 85 < 283 IQR (CV) : 16 (0.5) |
13 distinct values | 0 (0.0%) |
|
| 18 | Weight [integer] |
Mean (sd) : 1072.5 (52.6) min < med < max: 1000 < 1070 < 1615 IQR (CV) : 45 (0) |
59 distinct values | 0 (0.0%) |
|
| 19 | Mfr_Guarantee [integer] |
Min : 0 Mean : 0.4 Max : 1 |
0 : 848 (59.1%) 1 : 588 (40.9%) |
0 (0.0%) |
|
| 20 | BOVAG_Guarantee [integer] |
Min : 0 Mean : 0.9 Max : 1 |
0 : 150 (10.4%) 1 : 1286 (89.6%) |
0 (0.0%) |
|
| 21 | Guarantee_Period [integer] |
Mean (sd) : 3.8 (3) min < med < max: 3 < 3 < 36 IQR (CV) : 0 (0.8) |
3 : 1274 (88.7%) 6 : 77 ( 5.4%) 12 : 73 ( 5.1%) 13 : 1 ( 0.1%) 18 : 1 ( 0.1%) 20 : 1 ( 0.1%) 24 : 4 ( 0.3%) 28 : 1 ( 0.1%) 36 : 4 ( 0.3%) |
0 (0.0%) |
|
| 22 | ABS [integer] |
Min : 0 Mean : 0.8 Max : 1 |
0 : 268 (18.7%) 1 : 1168 (81.3%) |
0 (0.0%) |
|
| 23 | Airbag_1 [integer] |
Min : 0 Mean : 1 Max : 1 |
0 : 42 ( 2.9%) 1 : 1394 (97.1%) |
0 (0.0%) |
|
| 24 | Airbag_2 [integer] |
Min : 0 Mean : 0.7 Max : 1 |
0 : 398 (27.7%) 1 : 1038 (72.3%) |
0 (0.0%) |
|
| 25 | Airco [integer] |
Min : 0 Mean : 0.5 Max : 1 |
0 : 706 (49.2%) 1 : 730 (50.8%) |
0 (0.0%) |
|
| 26 | Automatic_airco [integer] |
Min : 0 Mean : 0.1 Max : 1 |
0 : 1355 (94.4%) 1 : 81 ( 5.6%) |
0 (0.0%) |
|
| 27 | Boardcomputer [integer] |
Min : 0 Mean : 0.3 Max : 1 |
0 : 1013 (70.5%) 1 : 423 (29.5%) |
0 (0.0%) |
|
| 28 | CD_Player [integer] |
Min : 0 Mean : 0.2 Max : 1 |
0 : 1122 (78.1%) 1 : 314 (21.9%) |
0 (0.0%) |
|
| 29 | Central_Lock [integer] |
Min : 0 Mean : 0.6 Max : 1 |
0 : 603 (42.0%) 1 : 833 (58.0%) |
0 (0.0%) |
|
| 30 | Powered_Windows [integer] |
Min : 0 Mean : 0.6 Max : 1 |
0 : 629 (43.8%) 1 : 807 (56.2%) |
0 (0.0%) |
|
| 31 | Power_Steering [integer] |
Min : 0 Mean : 1 Max : 1 |
0 : 32 ( 2.2%) 1 : 1404 (97.8%) |
0 (0.0%) |
|
| 32 | Radio [integer] |
Min : 0 Mean : 0.1 Max : 1 |
0 : 1226 (85.4%) 1 : 210 (14.6%) |
0 (0.0%) |
|
| 33 | Mistlamps [integer] |
Min : 0 Mean : 0.3 Max : 1 |
0 : 1067 (74.3%) 1 : 369 (25.7%) |
0 (0.0%) |
|
| 34 | Sport_Model [integer] |
Min : 0 Mean : 0.3 Max : 1 |
0 : 1005 (70.0%) 1 : 431 (30.0%) |
0 (0.0%) |
|
| 35 | Backseat_Divider [integer] |
Min : 0 Mean : 0.8 Max : 1 |
0 : 330 (23.0%) 1 : 1106 (77.0%) |
0 (0.0%) |
|
| 36 | Metallic_Rim [integer] |
Min : 0 Mean : 0.2 Max : 1 |
0 : 1142 (79.5%) 1 : 294 (20.5%) |
0 (0.0%) |
|
| 37 | Radio_cassette [integer] |
Min : 0 Mean : 0.1 Max : 1 |
0 : 1227 (85.4%) 1 : 209 (14.6%) |
0 (0.0%) |
|
| 38 | Parking_Assistant [integer] |
Min : 0 Mean : 0 Max : 1 |
0 : 1432 (99.7%) 1 : 4 ( 0.3%) |
0 (0.0%) |
|
| 39 | Tow_Bar [integer] |
Min : 0 Mean : 0.3 Max : 1 |
0 : 1037 (72.2%) 1 : 399 (27.8%) |
0 (0.0%) |
# Missing Variables Plot for the Dataset
gg_miss_var(ToyotaDT, show_pct = TRUE)We can see that there is no missing values in our dataset ToyotaCorolla.csv
Split the data into training (50%), validation (30%), and test (20%) datasets.
# Setting Seed
set.seed(1)
# Splitting Training and Validation and Test
splitting <- sample(1:3,size=nrow(ToyotaDT),replace=TRUE,prob=c(0.5,0.3,0.2))
Training <- ToyotaDT[splitting==1,]
Validation <- ToyotaDT[splitting==2,]
Test <- ToyotaDT[splitting==3,]
# Checking if proportions are right
Prop_Training <- (nrow(Training)/nrow(ToyotaDT))*100
Prop_Validation <- (nrow(Validation)/nrow(ToyotaDT))*100
Prop_Test <- (nrow(Test)/nrow(ToyotaDT))*100
# Print Proportion
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")[1] “The Proportions are: 52.58 % In Training, 27.79 % In Validation, and 19.64 % In Test”
with the outcome variable Price and predictor variables Age_08, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.
Numerical: Price
Numerical: Age_08, KM, HP, Doors, Quarterly_Tax, Quarantee_Period
Categorical/Dummy: Fuel Type, Automatic, Mfr_Guarantee, Airco, Automatic_airco, CD_Player, Powered Windows, Sport_Model, Tow_Bar
# Linear OLS Regression on Training
Regression_Price <- lm(Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar, data = Training)
# Scientific Notation
options(scipen = 999)
summary(Regression_Price)FALSE
FALSE Call:
FALSE lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
FALSE Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period +
FALSE Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
FALSE Tow_Bar, data = Training)
FALSE
FALSE Residuals:
FALSE Min 1Q Median 3Q Max
FALSE -4605.0 -725.0 26.5 655.5 5567.0
FALSE
FALSE Coefficients:
FALSE Estimate Std. Error t value Pr(>|t|)
FALSE (Intercept) 10149.303878 753.570928 13.468 < 0.0000000000000002 ***
FALSE Age_08_04 -106.254953 3.648627 -29.122 < 0.0000000000000002 ***
FALSE KM -0.018110 0.001693 -10.697 < 0.0000000000000002 ***
FALSE Fuel_TypeDiesel 1775.761977 509.701393 3.484 0.000523 ***
FALSE Fuel_TypePetrol 2087.180327 530.110397 3.937 0.0000902216317796 ***
FALSE HP 26.355347 3.846774 6.851 0.0000000000154436 ***
FALSE Automatic 605.249517 178.506754 3.391 0.000734 ***
FALSE Doors 153.627129 46.146978 3.329 0.000915 ***
FALSE Quarterly_Tax 16.814925 2.081831 8.077 0.0000000000000027 ***
FALSE Mfr_Guarantee 125.684485 91.024994 1.381 0.167768
FALSE Guarantee_Period 72.439768 15.661721 4.625 0.0000044171894508 ***
FALSE Airco 175.868563 106.848246 1.646 0.100197
FALSE Automatic_airco 3178.052845 209.461203 15.173 < 0.0000000000000002 ***
FALSE CD_Player 247.920668 120.057343 2.065 0.039270 *
FALSE Powered_Windows 426.529261 103.224416 4.132 0.0000400718888028 ***
FALSE Sport_Model 268.489425 98.070812 2.738 0.006336 **
FALSE Tow_Bar -250.505084 97.621305 -2.566 0.010481 *
FALSE ---
FALSE Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE
FALSE Residual standard error: 1155 on 738 degrees of freedom
FALSE Multiple R-squared: 0.8915, Adjusted R-squared: 0.8891
FALSE F-statistic: 378.8 on 16 and 738 DF, p-value: < 0.00000000000000022
Taking into account all the requested variables in our linear regression without further analysis or variable selection, we can observe a fairly high Multiple R-squared, close to 0.8915 and a fit of 0.8891. We have a very significant F-statistic, close to 0 which means that this model is already quite complex and better than a naive model including only the intercept. The most significant variables are the Age of the car, the number of kilometres (KM) and the automatic air conditioning (Automatic_airco), since all have the smallest p-value.
# use regsubsets() in package leaps to run an exhaustive search.
library(leaps)
# Duplicate the Dataset Training for Modification of Dummy
Training_Search <- Training
# create dummies for fuel type
Fuel_Type <- as.data.frame(model.matrix(~ 0 + Fuel_Type, data=Training_Search))
# replace Fuel_Type column with 2 dummies
Training_Search <- cbind(Training_Search[,-8], Fuel_Type[,])
# Search
search <- regsubsets(Price ~ Age_08_04 + KM + Fuel_TypeCNG + Fuel_TypeDiesel + Fuel_TypePetrol + HP + Automatic + Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar, data = Training_Search, nbest = 1, nvmax = dim(Training_Search)[2],
method = "exhaustive")FALSE Reordering variables and trying again:
sum <- summary(search)# show models
sum$whichFALSE (Intercept) Age_08_04 KM Fuel_TypeCNG Fuel_TypeDiesel Fuel_TypePetrol
FALSE 1 TRUE TRUE FALSE FALSE FALSE FALSE
FALSE 2 TRUE TRUE FALSE FALSE FALSE FALSE
FALSE 3 TRUE TRUE TRUE FALSE FALSE FALSE
FALSE 4 TRUE TRUE TRUE FALSE FALSE FALSE
FALSE 5 TRUE TRUE TRUE FALSE FALSE FALSE
FALSE 6 TRUE TRUE TRUE FALSE FALSE FALSE
FALSE 7 TRUE TRUE TRUE FALSE FALSE FALSE
FALSE 8 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 9 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 10 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 11 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 12 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 13 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 14 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 15 TRUE TRUE TRUE TRUE FALSE FALSE
FALSE 16 TRUE TRUE TRUE TRUE TRUE FALSE
FALSE HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco
FALSE 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
FALSE 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
FALSE 3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
FALSE 4 FALSE FALSE FALSE TRUE FALSE FALSE FALSE
FALSE 5 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
FALSE 6 TRUE FALSE FALSE TRUE FALSE FALSE FALSE
FALSE 7 TRUE FALSE FALSE TRUE FALSE TRUE FALSE
FALSE 8 TRUE FALSE FALSE TRUE FALSE TRUE FALSE
FALSE 9 TRUE TRUE FALSE TRUE FALSE TRUE FALSE
FALSE 10 TRUE TRUE TRUE TRUE FALSE TRUE FALSE
FALSE 11 TRUE TRUE TRUE TRUE FALSE TRUE FALSE
FALSE 12 TRUE TRUE TRUE TRUE FALSE TRUE FALSE
FALSE 13 TRUE TRUE TRUE TRUE FALSE TRUE FALSE
FALSE 14 TRUE TRUE TRUE TRUE FALSE TRUE TRUE
FALSE 15 TRUE TRUE TRUE TRUE TRUE TRUE TRUE
FALSE 16 TRUE TRUE TRUE TRUE TRUE TRUE TRUE
FALSE Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar
FALSE 1 FALSE FALSE FALSE FALSE FALSE
FALSE 2 TRUE FALSE FALSE FALSE FALSE
FALSE 3 TRUE FALSE FALSE FALSE FALSE
FALSE 4 TRUE FALSE FALSE FALSE FALSE
FALSE 5 TRUE FALSE FALSE FALSE FALSE
FALSE 6 TRUE FALSE TRUE FALSE FALSE
FALSE 7 TRUE FALSE TRUE FALSE FALSE
FALSE 8 TRUE FALSE TRUE FALSE FALSE
FALSE 9 TRUE FALSE TRUE FALSE FALSE
FALSE 10 TRUE FALSE TRUE FALSE FALSE
FALSE 11 TRUE FALSE TRUE TRUE FALSE
FALSE 12 TRUE FALSE TRUE TRUE TRUE
FALSE 13 TRUE TRUE TRUE TRUE TRUE
FALSE 14 TRUE TRUE TRUE TRUE TRUE
FALSE 15 TRUE TRUE TRUE TRUE TRUE
FALSE 16 TRUE TRUE TRUE TRUE TRUE
# show metrics
sum$rsqFALSE [1] 0.7655313 0.8300988 0.8503445 0.8620935 0.8759748 0.8801229 0.8822341
FALSE [8] 0.8844385 0.8861651 0.8874965 0.8888897 0.8897505 0.8905682 0.8909579
FALSE [15] 0.8913048 0.8914512
R-Squared keep increasing when we add more and more variables to the model, which is already expected since it doesn’t account for the number of parameter (no penalty)
sum$adjr2FALSE [1] 0.7652199 0.8296470 0.8497467 0.8613580 0.8751469 0.8791614 0.8811305
FALSE [8] 0.8831993 0.8847899 0.8859844 0.8872447 0.8879675 0.8886484 0.8888949
FALSE [15] 0.8890985 0.8890979
Adjusted R-Squared keep increasing until the 15th value, 0.8890985 which is slight higher than the last one (16th).
sum$cpFALSE [1] 840.94284 404.55680 269.09689 191.32625 99.07820 72.91427 60.58056
FALSE [8] 47.61333 37.89066 30.85073 23.39186 19.54705 15.99520 15.34962
FALSE [15] 14.99434 16.00000
The closest Cp to predictors+1 is the last one (16th value) which is 16, close to 17 = p+1 = 16+1.
# Setting Forward Selection
Regression_Price_Forward <- step(Regression_Price, direction = "forward")FALSE Start: AIC=10665.11
FALSE Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
FALSE Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
FALSE Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
FALSE Tow_Bar
summary(Regression_Price_Forward) FALSE
FALSE Call:
FALSE lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
FALSE Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period +
FALSE Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
FALSE Tow_Bar, data = Training)
FALSE
FALSE Residuals:
FALSE Min 1Q Median 3Q Max
FALSE -4605.0 -725.0 26.5 655.5 5567.0
FALSE
FALSE Coefficients:
FALSE Estimate Std. Error t value Pr(>|t|)
FALSE (Intercept) 10149.303878 753.570928 13.468 < 0.0000000000000002 ***
FALSE Age_08_04 -106.254953 3.648627 -29.122 < 0.0000000000000002 ***
FALSE KM -0.018110 0.001693 -10.697 < 0.0000000000000002 ***
FALSE Fuel_TypeDiesel 1775.761977 509.701393 3.484 0.000523 ***
FALSE Fuel_TypePetrol 2087.180327 530.110397 3.937 0.0000902216317796 ***
FALSE HP 26.355347 3.846774 6.851 0.0000000000154436 ***
FALSE Automatic 605.249517 178.506754 3.391 0.000734 ***
FALSE Doors 153.627129 46.146978 3.329 0.000915 ***
FALSE Quarterly_Tax 16.814925 2.081831 8.077 0.0000000000000027 ***
FALSE Mfr_Guarantee 125.684485 91.024994 1.381 0.167768
FALSE Guarantee_Period 72.439768 15.661721 4.625 0.0000044171894508 ***
FALSE Airco 175.868563 106.848246 1.646 0.100197
FALSE Automatic_airco 3178.052845 209.461203 15.173 < 0.0000000000000002 ***
FALSE CD_Player 247.920668 120.057343 2.065 0.039270 *
FALSE Powered_Windows 426.529261 103.224416 4.132 0.0000400718888028 ***
FALSE Sport_Model 268.489425 98.070812 2.738 0.006336 **
FALSE Tow_Bar -250.505084 97.621305 -2.566 0.010481 *
FALSE ---
FALSE Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE
FALSE Residual standard error: 1155 on 738 degrees of freedom
FALSE Multiple R-squared: 0.8915, Adjusted R-squared: 0.8891
FALSE F-statistic: 378.8 on 16 and 738 DF, p-value: < 0.00000000000000022
# Setting Backward Selection
Regression_Price_Backward <- step(Regression_Price, direction = "backward")FALSE Start: AIC=10665.11
FALSE Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
FALSE Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
FALSE Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
FALSE Tow_Bar
FALSE
FALSE Df Sum of Sq RSS AIC
FALSE - Mfr_Guarantee 1 2543346 987052419 10665
FALSE <none> 984509073 10665
FALSE - Airco 1 3614140 988123213 10666
FALSE - CD_Player 1 5688678 990197751 10668
FALSE - Tow_Bar 1 8784301 993293374 10670
FALSE - Sport_Model 1 9998577 994507650 10671
FALSE - Doors 1 14784705 999293778 10674
FALSE - Automatic 1 15336383 999845456 10675
FALSE - Fuel_Type 2 20941946 1005451019 10677
FALSE - Powered_Windows 1 22776981 1007286054 10680
FALSE - Guarantee_Period 1 28538984 1013048057 10685
FALSE - HP 1 62619200 1047128273 10710
FALSE - Quarterly_Tax 1 87028661 1071537733 10727
FALSE - KM 1 152653191 1137162264 10772
FALSE - Automatic_airco 1 307099000 1291608073 10868
FALSE - Age_08_04 1 1131365100 2115874173 11241
FALSE
FALSE Step: AIC=10665.05
FALSE Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
FALSE Quarterly_Tax + Guarantee_Period + Airco + Automatic_airco +
FALSE CD_Player + Powered_Windows + Sport_Model + Tow_Bar
FALSE
FALSE Df Sum of Sq RSS AIC
FALSE <none> 987052419 10665
FALSE - Airco 1 3591518 990643937 10666
FALSE - CD_Player 1 6457971 993510389 10668
FALSE - Tow_Bar 1 8540281 995592700 10670
FALSE - Sport_Model 1 9978477 997030896 10671
FALSE - Doors 1 15120231 1002172650 10674
FALSE - Automatic 1 15145014 1002197433 10675
FALSE - Fuel_Type 2 22008962 1009061381 10678
FALSE - Powered_Windows 1 21946957 1008999375 10680
FALSE - Guarantee_Period 1 27250734 1014303152 10684
FALSE - HP 1 63791761 1050844179 10710
FALSE - Quarterly_Tax 1 91041143 1078093561 10730
FALSE - KM 1 154155059 1141207477 10773
FALSE - Automatic_airco 1 306222171 1293274589 10867
FALSE - Age_08_04 1 1146445473 2133497891 11245
summary(Regression_Price_Backward) FALSE
FALSE Call:
FALSE lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
FALSE Doors + Quarterly_Tax + Guarantee_Period + Airco + Automatic_airco +
FALSE CD_Player + Powered_Windows + Sport_Model + Tow_Bar, data = Training)
FALSE
FALSE Residuals:
FALSE Min 1Q Median 3Q Max
FALSE -4582.7 -725.6 25.2 660.5 5528.3
FALSE
FALSE Coefficients:
FALSE Estimate Std. Error t value Pr(>|t|)
FALSE (Intercept) 10131.277848 753.919822 13.438 < 0.0000000000000002 ***
FALSE Age_08_04 -106.642999 3.640018 -29.297 < 0.0000000000000002 ***
FALSE KM -0.018189 0.001693 -10.743 < 0.0000000000000002 ***
FALSE Fuel_TypeDiesel 1771.331629 510.003814 3.473 0.000544 ***
FALSE Fuel_TypePetrol 2143.129536 528.883581 4.052 0.000056116325460097 ***
FALSE HP 26.577649 3.845760 6.911 0.000000000010411626 ***
FALSE Automatic 601.387645 178.594281 3.367 0.000798 ***
FALSE Doors 155.306593 46.159231 3.365 0.000806 ***
FALSE Quarterly_Tax 17.108350 2.072226 8.256 0.000000000000000692 ***
FALSE Guarantee_Period 70.500818 15.608204 4.517 0.000007302497428398 ***
FALSE Airco 175.316066 106.913011 1.640 0.101472
FALSE Automatic_airco 3173.034931 209.558088 15.142 < 0.0000000000000002 ***
FALSE CD_Player 263.050250 119.629577 2.199 0.028196 *
FALSE Powered_Windows 417.921196 103.099161 4.054 0.000055786711372138 ***
FALSE Sport_Model 268.218889 98.130749 2.733 0.006421 **
FALSE Tow_Bar -246.913474 97.646480 -2.529 0.011657 *
FALSE ---
FALSE Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE
FALSE Residual standard error: 1156 on 739 degrees of freedom
FALSE Multiple R-squared: 0.8912, Adjusted R-squared: 0.889
FALSE F-statistic: 403.4 on 15 and 739 DF, p-value: < 0.00000000000000022
# Setting Backward Selection
Regression_Price_Backward <- step(Regression_Price, direction = "both")FALSE Start: AIC=10665.11
FALSE Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
FALSE Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
FALSE Automatic_airco + CD_Player + Powered_Windows + Sport_Model +
FALSE Tow_Bar
FALSE
FALSE Df Sum of Sq RSS AIC
FALSE - Mfr_Guarantee 1 2543346 987052419 10665
FALSE <none> 984509073 10665
FALSE - Airco 1 3614140 988123213 10666
FALSE - CD_Player 1 5688678 990197751 10668
FALSE - Tow_Bar 1 8784301 993293374 10670
FALSE - Sport_Model 1 9998577 994507650 10671
FALSE - Doors 1 14784705 999293778 10674
FALSE - Automatic 1 15336383 999845456 10675
FALSE - Fuel_Type 2 20941946 1005451019 10677
FALSE - Powered_Windows 1 22776981 1007286054 10680
FALSE - Guarantee_Period 1 28538984 1013048057 10685
FALSE - HP 1 62619200 1047128273 10710
FALSE - Quarterly_Tax 1 87028661 1071537733 10727
FALSE - KM 1 152653191 1137162264 10772
FALSE - Automatic_airco 1 307099000 1291608073 10868
FALSE - Age_08_04 1 1131365100 2115874173 11241
FALSE
FALSE Step: AIC=10665.05
FALSE Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
FALSE Quarterly_Tax + Guarantee_Period + Airco + Automatic_airco +
FALSE CD_Player + Powered_Windows + Sport_Model + Tow_Bar
FALSE
FALSE Df Sum of Sq RSS AIC
FALSE <none> 987052419 10665
FALSE + Mfr_Guarantee 1 2543346 984509073 10665
FALSE - Airco 1 3591518 990643937 10666
FALSE - CD_Player 1 6457971 993510389 10668
FALSE - Tow_Bar 1 8540281 995592700 10670
FALSE - Sport_Model 1 9978477 997030896 10671
FALSE - Doors 1 15120231 1002172650 10674
FALSE - Automatic 1 15145014 1002197433 10675
FALSE - Fuel_Type 2 22008962 1009061381 10678
FALSE - Powered_Windows 1 21946957 1008999375 10680
FALSE - Guarantee_Period 1 27250734 1014303152 10684
FALSE - HP 1 63791761 1050844179 10710
FALSE - Quarterly_Tax 1 91041143 1078093561 10730
FALSE - KM 1 154155059 1141207477 10773
FALSE - Automatic_airco 1 306222171 1293274589 10867
FALSE - Age_08_04 1 1146445473 2133497891 11245
summary(Regression_Price_Backward)FALSE
FALSE Call:
FALSE lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
FALSE Doors + Quarterly_Tax + Guarantee_Period + Airco + Automatic_airco +
FALSE CD_Player + Powered_Windows + Sport_Model + Tow_Bar, data = Training)
FALSE
FALSE Residuals:
FALSE Min 1Q Median 3Q Max
FALSE -4582.7 -725.6 25.2 660.5 5528.3
FALSE
FALSE Coefficients:
FALSE Estimate Std. Error t value Pr(>|t|)
FALSE (Intercept) 10131.277848 753.919822 13.438 < 0.0000000000000002 ***
FALSE Age_08_04 -106.642999 3.640018 -29.297 < 0.0000000000000002 ***
FALSE KM -0.018189 0.001693 -10.743 < 0.0000000000000002 ***
FALSE Fuel_TypeDiesel 1771.331629 510.003814 3.473 0.000544 ***
FALSE Fuel_TypePetrol 2143.129536 528.883581 4.052 0.000056116325460097 ***
FALSE HP 26.577649 3.845760 6.911 0.000000000010411626 ***
FALSE Automatic 601.387645 178.594281 3.367 0.000798 ***
FALSE Doors 155.306593 46.159231 3.365 0.000806 ***
FALSE Quarterly_Tax 17.108350 2.072226 8.256 0.000000000000000692 ***
FALSE Guarantee_Period 70.500818 15.608204 4.517 0.000007302497428398 ***
FALSE Airco 175.316066 106.913011 1.640 0.101472
FALSE Automatic_airco 3173.034931 209.558088 15.142 < 0.0000000000000002 ***
FALSE CD_Player 263.050250 119.629577 2.199 0.028196 *
FALSE Powered_Windows 417.921196 103.099161 4.054 0.000055786711372138 ***
FALSE Sport_Model 268.218889 98.130749 2.733 0.006421 **
FALSE Tow_Bar -246.913474 97.646480 -2.529 0.011657 *
FALSE ---
FALSE Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE
FALSE Residual standard error: 1156 on 739 degrees of freedom
FALSE Multiple R-squared: 0.8912, Adjusted R-squared: 0.889
FALSE F-statistic: 403.4 on 15 and 739 DF, p-value: < 0.00000000000000022
We can see that in both, Backward and Stepwise, only Mfr_Guarantee is dropped. Forward Selection keep all variables.
After the Exhaustive Search and Popular Subset Selection Algorithms, we should have those 3-4 most important variable for predicting car’s price: Age, KM, Automatic_airco and Quarterly_Tax. Depending if we decided to drop the variable Mfr_Guarante, we might get slight different results in predictions, but the most important variables for predictions should stay the same. The Following Performance Assessment keep the same full model.
library(forecast)
# use predict() to make predictions on a new set.
car.lm.pred <- predict(Regression_Price, Validation)
options(scipen=999, digits = 0)
some.residuals <- Validation$Price[1:20] - car.lm.pred[1:20]
data.frame("Predicted" = car.lm.pred[1:20], "Actual" = Validation$Price[1:20],
"Residual" = some.residuals)FALSE Predicted Actual Residual
FALSE 1 15327 13950 -1377
FALSE 2 14920 18600 3680
FALSE 3 17033 21500 4467
FALSE 4 20592 19600 -992
FALSE 5 20061 22500 2439
FALSE 6 19881 22750 2869
FALSE 7 14134 16950 2816
FALSE 8 17294 15950 -1344
FALSE 9 14919 15750 831
FALSE 10 14604 15750 1146
FALSE 11 14910 15950 1040
FALSE 12 14286 15750 1464
FALSE 13 14478 16750 2272
FALSE 14 14107 13950 -157
FALSE 15 15345 16950 1605
FALSE 16 16061 16950 889
FALSE 17 19033 19000 -33
FALSE 18 16568 17950 1382
FALSE 19 19668 21950 2282
FALSE 20 14663 15250 587
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(car.lm.pred, Validation$Price)FALSE ME RMSE MAE MPE MAPE
FALSE Test set 83 1319 948 -0.789 8.87
Here the resulting metrics for our predictions errors. The closest to 0 the better.
library(forecast)
car.lm.pred <- predict(Regression_Price, Validation)
all.residuals <- Validation$Price - car.lm.pred
length(all.residuals[which(all.residuals > -1406 & all.residuals < 1406)])/400FALSE [1] 0.802
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "", col = "#1c6155")We can the residuals of our predictions on the validation set. The Spread is clearly visible, between -2000 and 2000 and normally distributed. Some extremes values for our residuals appear on the far right,.
# Set Seed
set.seed(1)
ToyotaDT_TREE <- fread("DATA/ToyotaCorolla.csv")
# Input Cutting Ratio
Prob_Train <- 0.6
Prob_Validation <- 1 - Prob_Train
# Training and Validation Set Splitting
sample <- sample(c(TRUE, FALSE), nrow(ToyotaDT_TREE), replace=TRUE, prob=c(Prob_Train,Prob_Validation))
Training_TREE <- ToyotaDT_TREE[sample, ]
Validation_TREE <- ToyotaDT_TREE[!sample, ]
# Proportions Check
Prop_Train <- nrow(Training_TREE)/nrow(ToyotaDT_TREE)*100
Prop_Valid <- nrow(Validation_TREE)/nrow(ToyotaDT_TREE)*100
# Printing Proportions for double checking
print(paste(round(Prop_Train,2),"% In Training", round(Prop_Valid,2),"% In Validation"))FALSE [1] "61.42 % In Training 38.58 % In Validation"
with CP=0.001 and “ANOVA” Method
# Set Seed
set.seed(1)
# Regression Tree Packages
library(rpart)
library(rpart.plot)
# As Factor Fuel_Type
#ToyotaDT_TREE$Fuel_Type <- as.factor(ToyotaDT_TREE$Fuel_Type)
# Regression Tree Parameters
cp_1 = 0.001
method = "anova"
minbucket = 1
maxdepth = 30 #30 maximum
# Running Regression Tree
Tree_1 <- rpart(Price ~ Age_08_04+KM+Fuel_Type+HP+Automatic+Doors+Quarterly_Tax+Mfr_Guarantee+Guarantee_Period+Airco+Automatic+CD_Player+Powered_Windows+Sport_Model+Tow_Bar, data = Training_TREE, control = rpart.control(maxdepth = maxdepth, cp=cp_1,minbucket = minbucket, method=method))
# Plotting Regression Tree
Tree_1_Plot <- rpart.plot(Tree_1, type=0, varlen = 0, box.col=ifelse(Tree_1$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, tweak = 0.7)# Number of Leafs
Length_Tree_1 = length(Tree_1$frame$var[Tree_1$frame$var == "<leaf>"])
print(paste("There is", Length_Tree_1,"Number of Leaves"))FALSE [1] "There is 31 Number of Leaves"
# Tree Size
plotcp(Tree_1)With small CP, Our Regression Tree can go until 31 Leaves… But we see that the relative error seems to already be small enough between 9 and 13 Leaves.
# Most 3 or 4 specifications for predictions in car's price
Tree_1FALSE n= 882
FALSE
FALSE node), split, n, deviance, yval
FALSE * denotes terminal node
FALSE
FALSE 1) root 882 11000000000 10700
FALSE 2) Age_08_04>=31.5 773 3070000000 9630
FALSE 4) Age_08_04>=56.5 516 839000000 8670
FALSE 8) Age_08_04>=68.5 248 283000000 7970
FALSE 16) KM>=9.94e+04 71 91800000 7400
FALSE 32) KM>=2e+05 5 3580000 5700 *
FALSE 33) KM< 2e+05 66 72600000 7530 *
FALSE 17) KM< 9.94e+04 177 160000000 8190 *
FALSE 9) Age_08_04< 68.5 268 316000000 9330
FALSE 18) KM>=1.38e+05 11 9700000 7390 *
FALSE 19) KM< 1.38e+05 257 263000000 9410
FALSE 38) Quarterly_Tax< 78.5 144 130000000 9070
FALSE 76) KM>=4.23e+04 126 101000000 8940
FALSE 152) Airco< 0.5 86 59000000 8730 *
FALSE 153) Airco>=0.5 40 30600000 9390 *
FALSE 77) KM< 4.23e+04 18 12700000 9950 *
FALSE 39) Quarterly_Tax>=78.5 113 94200000 9850
FALSE 78) KM>=8.82e+04 34 17000000 9270 *
FALSE 79) KM< 8.82e+04 79 60600000 10100 *
FALSE 5) Age_08_04< 56.5 257 830000000 11500
FALSE 10) KM>=1.28e+05 13 35200000 7040
FALSE 20) Quarterly_Tax< 74.5 4 7930000 5320 *
FALSE 21) Quarterly_Tax>=74.5 9 10200000 7810 *
FALSE 11) KM< 1.28e+05 244 518000000 11800
FALSE 22) Age_08_04>=44.5 119 210000000 11000
FALSE 44) Airco< 0.5 57 56200000 10600
FALSE 88) Mfr_Guarantee< 0.5 33 25500000 10200 *
FALSE 89) Mfr_Guarantee>=0.5 24 18300000 11100 *
FALSE 45) Airco>=0.5 62 132000000 11400
FALSE 90) KM>=5.66e+04 41 35600000 11000 *
FALSE 91) KM< 5.66e+04 21 71700000 12300
FALSE 182) Quarterly_Tax>=44 19 24300000 11900 *
FALSE 183) Quarterly_Tax< 44 2 18000000 16000 *
FALSE 23) Age_08_04< 44.5 125 172000000 12500
FALSE 46) Powered_Windows< 0.5 49 58700000 12000 *
FALSE 47) Powered_Windows>=0.5 76 90100000 12800 *
FALSE 3) Age_08_04< 31.5 109 896000000 18200
FALSE 6) HP< 113 100 506000000 17700
FALSE 12) Age_08_04>=22.5 39 137000000 16400
FALSE 24) HP< 104 17 30100000 14900
FALSE 48) Sport_Model< 0.5 6 929000 13500 *
FALSE 49) Sport_Model>=0.5 11 9230000 15700 *
FALSE 25) HP>=104 22 40000000 17600 *
FALSE 13) Age_08_04< 22.5 61 261000000 18600
FALSE 26) HP< 104 30 120000000 17600
FALSE 52) Quarterly_Tax< 222 24 26000000 17000
FALSE 104) Age_08_04>=14 19 11500000 16600 *
FALSE 105) Age_08_04< 14 5 740000 18500 *
FALSE 53) Quarterly_Tax>=222 6 43700000 20200
FALSE 106) KM>=4.43e+04 4 5370000 18500 *
FALSE 107) KM< 4.43e+04 2 3750000 23600 *
FALSE 27) HP>=104 31 91800000 19500
FALSE 54) Quarterly_Tax< 44 5 6780000 17000 *
FALSE 55) Quarterly_Tax>=44 26 48000000 19900
FALSE 110) Airco< 0.5 1 0 16400 *
FALSE 111) Airco>=0.5 25 34700000 20100 *
FALSE 7) HP>=113 9 113000000 23500
FALSE 14) Quarterly_Tax< 258 8 22600000 22400
FALSE 28) Age_08_04>=15 5 2890000 21300 *
FALSE 29) Age_08_04< 15 3 2670000 24300 *
FALSE 15) Quarterly_Tax>=258 1 0 32500 *
Tree_1_Importance <- as.data.frame(Tree_1$variable.importance)
DT::datatable(Tree_1_Importance)Using our Regression Tree output, we can see that the first 4 split are based on Age only. Then the 4 most important variables are Age_08_04, KM, Quarterly_Tax and HP.
# Predictions of Training and Validation
Training_Predictions <- predict(Tree_1,Training_TREE)
Validation_Predictions <- predict(Tree_1, Validation_TREE)
# RMSE ------------------------------------------------------------------------
# Training RMSE
RMSE_Training <- RMSE(Training_Predictions,Training_TREE$Price)
# Validation RMSE
RMSE_Validation <- RMSE(Validation_Predictions, Validation_TREE$Price)
# All RMSE
BIND_Training_Validation <- cbind(RMSE_Training,RMSE_Validation)
RMSE_Dataframe <- as.data.frame(BIND_Training_Validation)
flextable(RMSE_Dataframe) %>% set_header_labels(RMSE_Dataframe, values = list(RMSE_Training="RMSE Training",RMSE_Validation="RMSE Validation")
)RMSE Training | RMSE Validation |
|---|---|
972 | 1,318 |
# Computing Residuals --------------------------------------------------------------
Residuals_Training_Tree_1 <- Training_TREE$Price - Training_Predictions
Residuals_Validation_Tree_1 <- Validation_TREE$Price - Validation_Predictions
Residuals_Training_Tree_1 <- as.data.frame(Residuals_Training_Tree_1)
Residuals_Validation_Tree_1 <- as.data.frame(Residuals_Validation_Tree_1)
# Boxplots -------------------------------------------------------------------------
library(ggplot2)
# Boxplot Training
box1 <- ggplotly(ggplot(Residuals_Training_Tree_1) +
aes(x = "", y = Residuals_Training_Tree_1) +
geom_boxplot(fill = "#1c6155") +
labs(x = "with 882 observations", y = "Price Residuals", title = "Residuals Training VS Predicted", subtitle = "Boxplot",
caption = "") +
theme_minimal() + ylim(-10000, 10000) + theme(text = element_text(size = 8)))
# Boxplot Validation
box2 <- ggplotly(ggplot(Residuals_Validation_Tree_1) +
aes(x = "", y = Residuals_Validation_Tree_1) +
geom_boxplot(fill = "#1c6155") +
labs(x = "with 554 observations", y = "Price Residuals", title = "Residuals Validation VS Predicted", subtitle = "Boxplot",
caption = "") +
theme_minimal() + ylim(-10000, 10000) + theme(text = element_text(size = 8)))
# Combine them
combineWidgets(box1,box2, ncol=2)blablababla
CP Table of our full Tree Regression (CP=0)
cp_2 = 0
Tree_Full <- rpart(Price ~ Age_08_04+KM+Fuel_Type+HP+Automatic+Doors+Quarterly_Tax+Mfr_Guarantee+Guarantee_Period+Airco+Automatic+CD_Player+Powered_Windows+Sport_Model+Tow_Bar, data = Training_TREE, control = rpart.control(maxdepth = maxdepth, cp=cp_2,minbucket = minbucket, method=method))
DT::datatable(Tree_Full$cptable)Pruned_Tree <- prune(Tree_Full, cp = best_CP)
# Plotting Pruned Tree
Pruned_Tree_Plot <- rpart.plot(Pruned_Tree, type=0, varlen = 0, box.col=ifelse(Pruned_Tree$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, tweak = 0.7)# Number of Leafs
Length_Tree_Pruned = length(Pruned_Tree$frame$var[Pruned_Tree$frame$var == "<leaf>"])
print(paste("There is", Length_Tree_Pruned,"Number of Leaves"))FALSE [1] "There is 44 Number of Leaves"
# Predicting
Predictions_Full_Tree <- predict(Tree_Full, Validation_TREE)
Predictions_Prune_Tree <- predict(Pruned_Tree, Validation_TREE)
# RMSE of Full and Prune Tree
RMSE_Full_Tree <- RMSE(Validation_TREE$Price, Predictions_Full_Tree)
RMSE_Prune_Tree <- RMSE(Validation_TREE$Price, Predictions_Prune_Tree)
# Data frame RMSE
RMSE_Full_Tree_2 <- cbind(RMSE_Full_Tree,RMSE_Prune_Tree,RMSE_Validation)
RMSE_Full_Tree_2 <- as.data.frame(RMSE_Full_Tree_2)
# Printing Table
flextable(RMSE_Full_Tree_2) %>% set_header_labels(RMSE_Full_Tree_2,RMSE_Full_Tree="RMSE Validation - Full Tree with CP=0",RMSE_Prune_Tree="RMSE Validation - Prune Tree with CP=0.0008", RMSE_Validation="RMSE Validation - First Tree with CP=0.001")RMSE Validation - Full Tree with CP=0 | RMSE Validation - Prune Tree with CP=0.0008 | RMSE Validation - First Tree with CP=0.001 |
|---|---|---|
1,555 | 1,364 | 1,318 |
We can see that Prunning our Regression Tree helped us getting a smaller RMSE compared to the Full Tree, but with have slightly higher RMSE compared to the Regression Tree asked in the first questions with CP=0.001.
Here are the Binned_Price Variable into 20 levels
# Duplicating Training Data
New_Training_Tree <- Training_TREE
# Cutting into 20 breaks the Price Variable
New_Training_Tree$Price <- cut(New_Training_Tree$Price, breaks = 20)
# Renaming Price -> Binned_Price
colnames(New_Training_Tree)[3] <- "Binned_Price"
# Printing Levels of Binned_Price
print(paste(levels(New_Training_Tree$Binned_Price)))FALSE [1] "(4.32e+03,5.76e+03]" "(5.76e+03,7.16e+03]" "(7.16e+03,8.57e+03]"
FALSE [4] "(8.57e+03,9.98e+03]" "(9.98e+03,1.14e+04]" "(1.14e+04,1.28e+04]"
FALSE [7] "(1.28e+04,1.42e+04]" "(1.42e+04,1.56e+04]" "(1.56e+04,1.7e+04]"
FALSE [10] "(1.7e+04,1.84e+04]" "(1.84e+04,1.98e+04]" "(1.98e+04,2.12e+04]"
FALSE [13] "(2.12e+04,2.26e+04]" "(2.26e+04,2.41e+04]" "(2.41e+04,2.55e+04]"
FALSE [16] "(2.55e+04,2.69e+04]" "(2.69e+04,2.83e+04]" "(2.83e+04,2.97e+04]"
FALSE [19] "(2.97e+04,3.11e+04]" "(3.11e+04,3.25e+04]"
# Running Class Tree
CT_Tree <- rpart(Binned_Price ~ Age_08_04+KM+Fuel_Type+HP+Automatic+Doors+Quarterly_Tax+Mfr_Guarantee+Guarantee_Period+Airco+Automatic+CD_Player+Powered_Windows+Sport_Model+Tow_Bar, data = New_Training_Tree, method = "class")
# Plotting Tree
CT_Tree_Plot <- rpart.plot(CT_Tree, type=0, varlen = 0, box.col=ifelse(CT_Tree$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, tweak = 0.7)# Number of Leafs
Length_Tree_CT = length(CT_Tree$frame$var[CT_Tree$frame$var == "<leaf>"])
print(paste("There is", Length_Tree_CT,"Number of Leaves"))FALSE [1] "There is 8 Number of Leaves"
We can see that this Class Tree is having 8 Terminal Nodes (or Leaves) and way smaller in depth compared to the regression Tree we used in the exercices before. There is up to 4 layers in this CT.
# Most 3 or 4 specifications for predictions in car's price
CT_Tree_Importance <- as.data.frame(CT_Tree$variable.importance)
DT::datatable(CT_Tree_Importance)Using our Class Tree output, we can see that the 4 most important variables are Age_08_04, KM, CD_Player and Airco
# Dataframe Car Specifcations for Predictions
Prediction_1_Car <- data.frame("Age_08_04"=77,"KM"=117000, "Fuel_Type"="Petrol","HP"=110,"Automatic"=0,"Doors"=5,"Quarterly_Tax"=100,"Mfr_Guarantee"=0,"Guarantee_Period"=3, "Airco"=1, "Automatic"=0, "CD_Player"=0, "Powered_Windows"=0, "Sport_Model"=0, "Tow_Bar"=1)
# Predicting Car Price with CT
Predicted_Car_Price <- predict(CT_Tree,Prediction_1_Car)
# Removing 0 Probabilities
Predicted_Car_Price <- Predicted_Car_Price[, colSums(Predicted_Car_Price != 0) > 0]
# As Dataframe for ggplotting
Predicted_Car_Price <- as.data.frame(Predicted_Car_Price)
# Renaming Column
colnames(Predicted_Car_Price) <- "Probabilities in %"
# Probabilities Rounding and %
Predicted_Car_Price$`Probabilities in %` <- Predicted_Car_Price$`Probabilities in %` * 100
Predicted_Car_Price$`Probabilities in %` <- round(Predicted_Car_Price$`Probabilities in %`,2)
# Flextable
DT::datatable(Predicted_Car_Price)CT_Tree_Plot <- rpart.plot(CT_Tree, type=0, varlen = 0, box.col=ifelse(CT_Tree$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, tweak = 0.8)By using the CT Tree, we first start with Age_08_04 >= 57 -> Yes, then Age_08_04 >=70 -> Yes, then KM >= 174000 -> No and we stop at (7.16e+03,8.57e+03] which is 53.57% probability for this specific car. We are using far less steps to get to our predictions compared to the Regression Tree. By using CT instead of RT, we are not getting an exact value of price, but instead a probability distribution of the different Price Bins our car could be in and thus selecting the highest probability would get our car’s price into a category. CT have less precisions in outcome predictions but have far less branches and terminal nodes, thus more easy for reading and understanding each variables importance in the prediction process, as well as giving probabilities, which can be useful depending on the context.
The Goal is to predict the price of a used Toyota Corolla based on its specifications.
# Reproducible Results
set.seed(1)
# required Packages
library(neuralnet)
library(nnet)
library(caret)
# Load the Dataset with Fread()
ToyotaDT_NN <- fread("DATA/ToyotaCorolla.csv")
# Select the 15 variables
ToyotaDT_NN <- ToyotaDT_NN[,c("Price","Age_08_04","KM","Fuel_Type","HP","Automatic","Doors","Quarterly_Tax","Mfr_Guarantee","Guarantee_Period","Airco","Automatic_airco","CD_Player","Powered_Windows","Sport_Model","Tow_Bar")]
# Input Cutting Ratio
probability_train <- 0.6
probability_test <- 1 - probability_train
# Training and Validation Set Splitting
sample <- sample(c(TRUE, FALSE), nrow(ToyotaDT_NN), replace=TRUE, prob=c(probability_train,probability_test))
Train_NN <- ToyotaDT_NN[sample, ]
Test_NN <- ToyotaDT_NN[!sample, ]
# Proportions Check
Prop_Training <- nrow(Train_NN)/nrow(ToyotaDT_NN)*100
Prop_Validation <- nrow(Test_NN)/nrow(ToyotaDT_NN)*100
# Preprocess Variables - 8 Dummies - Fuel Type to Dummy - 7 Numerical
## create dummies for fuel type
Fuel_Type <- as.data.frame(model.matrix(~ 0 + Fuel_Type, data=Train_NN))
## replace Fuel_Type column with 2 dummies
Train_NN <- cbind(Train_NN[,-c("Fuel_Type")], Fuel_Type[,])
## replace Fuel_Type column with 2 dummies
Test_NN <- cbind(Test_NN[,-c("Fuel_Type")], Fuel_Type[,])
## Numerical Processing with Training Set
preProcValues <- preProcess(Train_NN[,c("Price","Age_08_04","KM","HP","Doors","Quarterly_Tax","Guarantee_Period")], method = "range")
## Preprocess the Training Set
Train_NN_Preprocess <- predict(preProcValues, Train_NN)
## Preprocess the Test Set
Test_NN_Preprocess <- predict(preProcValues, Test_NN)
# Print Proportions
print(paste(round(Prop_Training,2),"% In Training", round(Prop_Validation,2),"% In Validation"))FALSE [1] "61.42 % In Training 38.58 % In Validation"
# display weights
nn_1$weightsFALSE [[1]]
FALSE [[1]][[1]]
FALSE [,1] [,2]
FALSE [1,] -0.4309 2.209
FALSE [2,] 2.0122 3.652
FALSE [3,] 0.8866 -1.037
FALSE [4,] -0.7398 -1.433
FALSE [5,] -0.1306 0.255
FALSE [6,] -0.1019 -0.201
FALSE [7,] -0.6180 1.546
FALSE [8,] -0.0805 -0.176
FALSE [9,] 0.1344 16.006
FALSE [10,] -0.0968 -0.293
FALSE [11,] -0.2474 35.270
FALSE [12,] 0.0789 1.118
FALSE [13,] -0.1134 -0.299
FALSE [14,] 0.0771 1.787
FALSE [15,] 0.1233 1.682
FALSE [16,] 0.7030 -1.676
FALSE [17,] 0.5789 -1.406
FALSE [18,] 0.7630 0.267
FALSE
FALSE [[1]][[2]]
FALSE [,1]
FALSE [1,] -0.365
FALSE [2,] -1.028
FALSE [3,] 1.419
# display weights
nn_2$weightsFALSE [[1]]
FALSE [[1]][[1]]
FALSE [,1] [,2] [,3] [,4] [,5]
FALSE [1,] -1.0822 0.2664 -0.7609 1.869 0.39221
FALSE [2,] -1.6464 -0.6606 -1.4234 1.682 -1.92659
FALSE [3,] -0.3819 0.6737 -0.8228 -0.520 -1.17504
FALSE [4,] 1.1282 -0.0761 0.9746 -0.426 0.19693
FALSE [5,] -0.8555 -5.9541 0.8139 2.845 0.15236
FALSE [6,] -0.2748 -0.8334 0.4679 0.254 0.16805
FALSE [7,] 0.0783 -4.9264 1.6873 2.500 -0.09956
FALSE [8,] 0.4218 0.6210 0.2834 -0.246 -0.39992
FALSE [9,] 1.3905 -35.2628 -1.0621 18.735 -0.72520
FALSE [10,] 0.0566 0.4204 0.0491 -0.165 0.08564
FALSE [11,] 1.2989 4.3655 -0.5550 -0.871 0.03334
FALSE [12,] -0.2900 -1.4439 1.0142 1.175 -0.82484
FALSE [13,] -0.1653 1.9985 0.1753 -1.415 0.40796
FALSE [14,] -0.3060 -91.3117 0.2948 2.309 -0.00232
FALSE [15,] 0.2535 -78.1617 -0.0750 2.528 -0.44375
FALSE [16,] 1.1113 1.3446 -1.7674 -1.909 -1.48382
FALSE [17,] 0.5031 -1.6160 -1.2026 -0.971 -0.16114
FALSE [18,] 0.4911 -1.0049 -1.5398 -0.259 -0.24136
FALSE
FALSE [[1]][[2]]
FALSE [,1]
FALSE [1,] -0.743
FALSE [2,] 0.439
FALSE [3,] 0.201
FALSE [4,] 0.414
FALSE [5,] 0.743
FALSE [6,] 0.496
# display weights
nn_3$weightsFALSE [[1]]
FALSE [[1]][[1]]
FALSE [,1] [,2] [,3] [,4] [,5]
FALSE [1,] 1.0014 0.8207 -0.263 -1.308 1.154
FALSE [2,] 2.6944 -3.8210 2.189 10.242 -2.117
FALSE [3,] 1.9919 -1.8167 0.679 -1.140 0.783
FALSE [4,] 2.6962 1.9304 -0.935 -5.662 9.611
FALSE [5,] -2.0652 -0.1293 -0.492 1.182 0.806
FALSE [6,] 0.0779 -0.6930 -1.155 -0.298 -0.300
FALSE [7,] -5.3986 3.0127 1.366 4.488 12.672
FALSE [8,] 2.3418 0.8618 0.378 -0.762 0.396
FALSE [9,] -3.7351 -1.2264 -1.887 33.366 -3.809
FALSE [10,] -1.1707 -0.2756 -0.432 -1.375 -2.054
FALSE [11,] -5.5473 -0.0329 -0.332 4.946 -2.549
FALSE [12,] 1.0162 0.2500 0.318 3.204 6.706
FALSE [13,] -0.8608 0.4563 0.167 0.141 19.021
FALSE [14,] -4.3215 -0.0348 0.619 1.386 -1.194
FALSE [15,] 0.3951 0.1364 0.283 0.611 4.325
FALSE [16,] -2.9950 -3.3677 -0.303 -3.822 -341.739
FALSE [17,] 4.2604 -3.1110 -2.016 -5.497 -36.463
FALSE [18,] 0.9853 -0.4030 -0.868 0.543 0.255
FALSE
FALSE [[1]][[2]]
FALSE [,1] [,2] [,3] [,4] [,5]
FALSE [1,] -0.495 -1.264 -0.924 -3.26 -0.322
FALSE [2,] 1.039 -1.681 0.586 -2.20 -0.930
FALSE [3,] 36.661 -2.006 -0.832 -11.08 0.957
FALSE [4,] 0.554 0.297 2.981 48.03 -0.701
FALSE [5,] 0.937 -0.477 0.413 8.22 0.725
FALSE [6,] 0.718 2.243 1.117 1.29 -0.291
FALSE
FALSE [[1]][[3]]
FALSE [,1]
FALSE [1,] -0.624
FALSE [2,] 1.224
FALSE [3,] -0.286
FALSE [4,] -0.324
FALSE [5,] -0.247
FALSE [6,] 0.745
# Preprocess Scale - Range Method ----------------------------
# Predictions for Training -----------------------------------
# Predictions with nn
Train_Prediction_nn_1 <- predict(nn_1,Train_NN_Preprocess)
# Predictions with nn_2
Train_Prediction_nn_2 <- predict(nn_2,Train_NN_Preprocess)
# Predictions with nn_3
Train_Prediction_nn_3 <- predict(nn_3,Train_NN_Preprocess)
# Back transform to Original Scale ---------------------------
# Predictions with nn
Train_Prediction_nn_1 <- Train_Prediction_nn_1*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
# Predictions with nn_2
Train_Prediction_nn_2 <- Train_Prediction_nn_2*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
# Predictions with nn_3
Train_Prediction_nn_3 <- Train_Prediction_nn_3*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
# RMSE for Train -------------------------------------------------------
RMSE_Train_Prediction_nn_1 <- RMSE(Train_Prediction_nn_1,Train_NN$Price)
RMSE_Train_Prediction_nn_2 <- RMSE(Train_Prediction_nn_2,Train_NN$Price)
RMSE_Train_Prediction_nn_3 <- RMSE(Train_Prediction_nn_3,Train_NN$Price)
RMSE <- c(RMSE_Train_Prediction_nn_1,RMSE_Train_Prediction_nn_2,RMSE_Train_Prediction_nn_3)
# Rounding RMSE
RMSE <- round(RMSE,2)
# Adding Name to Model
Model <- c("Neural A: 1 Hidden Layer, 2 Nodes","Neural B: 1 Hidden Layer, 5 Nodes", "Neural C: 2 Hidden Layer, 5 Nodes")
Frame_RMSE <- cbind(Model,RMSE)
Frame_RMSE <- as.data.frame(Frame_RMSE)
RMSE_DATA <- flextable(Frame_RMSE) %>% fontsize(size = 8, part = "all")
(RMSE_DATA)Model | RMSE |
|---|---|
Neural A: 1 Hidden Layer, 2 Nodes | 984.02 |
Neural B: 1 Hidden Layer, 5 Nodes | 937.9 |
Neural C: 2 Hidden Layer, 5 Nodes | 857.48 |
# Predictions for Validation --------------------------------
# Predictions with nn
Validation_Prediction_nn_1 <- predict(nn_1,Test_NN_Preprocess)
# Predictions with nn_2
Validation_Prediction_nn_2 <- predict(nn_2,Test_NN_Preprocess)
# Predictions with nn_3
Validation_Prediction_nn_3 <- predict(nn_3,Test_NN_Preprocess)
# Back transform to Original Scale ---------------------------
# Predictions with nn
Validation_Prediction_nn_1 <- Validation_Prediction_nn_1*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
# Predictions with nn_2
Validation_Prediction_nn_2 <- Validation_Prediction_nn_2*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
# Predictions with nn_3
Validation_Prediction_nn_3 <- Validation_Prediction_nn_3*(max(Train_NN$Price)-min(Train_NN$Price))+min(Train_NN$Price)
# RMSE for Validation -------------------------------------------------------
RMSE_Validation_Prediction_1 <- RMSE(Validation_Prediction_nn_1, Test_NN$Price)
RMSE_Validation_Prediction_2 <- RMSE(Validation_Prediction_nn_2, Test_NN$Price)
RMSE_Validation_Prediction_3 <- RMSE(Validation_Prediction_nn_3, Test_NN$Price)
RMSE_Validation <- c(RMSE_Validation_Prediction_1,RMSE_Validation_Prediction_2,RMSE_Validation_Prediction_3)
# Rounding RMSE
RMSE_Validation <- round(RMSE_Validation,2)
# Adding Name to Model
Model <- c("Neural A: 1 Hidden Layer, 2 Nodes","Neural B: 1 Hidden Layer, 5 Nodes", "Neural C: 2 Hidden Layer, 5 Nodes")
Frame_RMSE_Validation <- cbind(Model,RMSE_Validation)
Frame_RMSE_Validation <- as.data.frame(Frame_RMSE_Validation)
RMSE_VALIDATION_DATA <- flextable(Frame_RMSE_Validation) %>% fontsize(size = 8, part = "all")
RMSE_VALIDATION_DATA <- set_header_labels(RMSE_VALIDATION_DATA, RMSE_Validation = "RMSE")
(RMSE_VALIDATION_DATA)Model | RMSE |
|---|---|
Neural A: 1 Hidden Layer, 2 Nodes | 1483.17 |
Neural B: 1 Hidden Layer, 5 Nodes | 1413.01 |
Neural C: 2 Hidden Layer, 5 Nodes | 1642.87 |
RMSE tend to decrease the more when complexify our neural network (Model C in that case). This indicates we are overfitting more and more our model to the training dataset.
The RMSE for the Validation Data is lower for the Neural Model B, and then it increases with the model C.
Model A and B are close, but Model A is not in a overfitting situation, we should decide wether we prefer good predictions with Model B and more overfitting to the training dataset, or Model A which give less accurate result but with more stability in future predictions since it is not overfitting such as Model C. In this case, I would prefer the Model B giving more accurate results but more prone to instability, thus choosing 1 Hidden Layers with 5 Nodes.
Github Repo for this Homework 3
Data Mining for Business Analytics: Concepts, Techniques, and Applications in R
Summarytools in R Markdown Documents
Data Science Plus - Fitting Neural Network in R
Backtransform with the Caret Package and Preprocess() Function